import warnings
warnings.filterwarnings("ignore")
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter("ignore", ConvergenceWarning)
# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np
# Library to split data
from sklearn.model_selection import train_test_split
# libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)
# To build model for prediction
import statsmodels.stats.api as sms
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
# To get different metric scores
from sklearn.metrics import (
f1_score,
accuracy_score,
recall_score,
precision_score,
confusion_matrix,
roc_auc_score,
plot_confusion_matrix,
precision_recall_curve,
roc_curve,
)
# Load the raw bank-customer dataset from the working directory into a DataFrame
Loan = pd.read_csv("Loan_Modelling.csv")
#Making a copy to preserve the data set from corruption during processing
df = Loan.copy()
# Second pristine snapshot; df1 appears unused later in the visible code — kept as a backup
df1 = df.copy()
#checking the first 5 rows
df.head()
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 25 | 1 | 49 | 91107 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 2 | 45 | 19 | 34 | 90089 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 3 | 39 | 15 | 11 | 94720 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 4 | 35 | 9 | 100 | 94112 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 5 | 35 | 8 | 45 | 91330 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
#Checking the last 5 rows
df.tail()
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4995 | 4996 | 29 | 3 | 40 | 92697 | 1 | 1.9 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4996 | 4997 | 30 | 4 | 15 | 92037 | 4 | 0.4 | 1 | 85 | 0 | 0 | 0 | 1 | 0 |
| 4997 | 4998 | 63 | 39 | 24 | 93023 | 2 | 0.3 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4998 | 4999 | 65 | 40 | 49 | 90034 | 3 | 0.5 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4999 | 5000 | 28 | 4 | 83 | 92612 | 3 | 0.8 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
#Checking rows and columns
df.shape
(5000, 14)
#Checking for duplicate values
df[df.duplicated()].count()
ID 0 Age 0 Experience 0 Income 0 ZIPCode 0 Family 0 CCAvg 0 Education 0 Mortgage 0 Personal_Loan 0 Securities_Account 0 CD_Account 0 Online 0 CreditCard 0 dtype: int64
#Looking at a sample of rows to check for issues
df.sample(n=10)
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2747 | 2748 | 38 | 12 | 30 | 91765 | 2 | 1.40 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2508 | 2509 | 40 | 15 | 63 | 93407 | 3 | 3.00 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| 3777 | 3778 | 62 | 37 | 98 | 94706 | 1 | 0.90 | 1 | 151 | 0 | 0 | 0 | 1 | 0 |
| 3171 | 3172 | 39 | 12 | 62 | 91910 | 3 | 2.33 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2797 | 2798 | 65 | 39 | 53 | 94608 | 1 | 2.50 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3837 | 3838 | 44 | 19 | 40 | 92350 | 4 | 0.00 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 394 | 395 | 33 | 9 | 80 | 91311 | 4 | 3.40 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
| 1305 | 1306 | 32 | 6 | 28 | 94025 | 2 | 0.30 | 2 | 88 | 0 | 0 | 0 | 1 | 0 |
| 4078 | 4079 | 36 | 12 | 58 | 91320 | 1 | 3.60 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3420 | 3421 | 66 | 41 | 114 | 94305 | 1 | 0.80 | 3 | 0 | 0 | 0 | 0 | 1 | 1 |
#Checking value types
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 5000 non-null int64 1 Age 5000 non-null int64 2 Experience 5000 non-null int64 3 Income 5000 non-null int64 4 ZIPCode 5000 non-null int64 5 Family 5000 non-null int64 6 CCAvg 5000 non-null float64 7 Education 5000 non-null int64 8 Mortgage 5000 non-null int64 9 Personal_Loan 5000 non-null int64 10 Securities_Account 5000 non-null int64 11 CD_Account 5000 non-null int64 12 Online 5000 non-null int64 13 CreditCard 5000 non-null int64 dtypes: float64(1), int64(13) memory usage: 547.0 KB
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| ID | 5000.0 | 2500.500000 | 1443.520003 | 1.0 | 1250.75 | 2500.5 | 3750.25 | 5000.0 |
| Age | 5000.0 | 45.338400 | 11.463166 | 23.0 | 35.00 | 45.0 | 55.00 | 67.0 |
| Experience | 5000.0 | 20.104600 | 11.467954 | -3.0 | 10.00 | 20.0 | 30.00 | 43.0 |
| Income | 5000.0 | 73.774200 | 46.033729 | 8.0 | 39.00 | 64.0 | 98.00 | 224.0 |
| ZIPCode | 5000.0 | 93169.257000 | 1759.455086 | 90005.0 | 91911.00 | 93437.0 | 94608.00 | 96651.0 |
| Family | 5000.0 | 2.396400 | 1.147663 | 1.0 | 1.00 | 2.0 | 3.00 | 4.0 |
| CCAvg | 5000.0 | 1.937938 | 1.747659 | 0.0 | 0.70 | 1.5 | 2.50 | 10.0 |
| Education | 5000.0 | 1.881000 | 0.839869 | 1.0 | 1.00 | 2.0 | 3.00 | 3.0 |
| Mortgage | 5000.0 | 56.498800 | 101.713802 | 0.0 | 0.00 | 0.0 | 101.00 | 635.0 |
| Personal_Loan | 5000.0 | 0.096000 | 0.294621 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| Securities_Account | 5000.0 | 0.104400 | 0.305809 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| CD_Account | 5000.0 | 0.060400 | 0.238250 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| Online | 5000.0 | 0.596800 | 0.490589 | 0.0 | 0.00 | 1.0 | 1.00 | 1.0 |
| CreditCard | 5000.0 | 0.294000 | 0.455637 | 0.0 | 0.00 | 0.0 | 1.00 | 1.0 |
## Converting the data type of categorical features to 'category'
# These columns are stored as small ints but encode categories/flags,
# not quantities, so the 'category' dtype is the honest representation.
cat_cols = ["Education", "Personal_Loan", "Securities_Account", "CD_Account", "Online", "CreditCard"]
df[cat_cols] = df[cat_cols].astype("category")
#Making sure the code above worked.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 5000 non-null int64 1 Age 5000 non-null int64 2 Experience 5000 non-null int64 3 Income 5000 non-null int64 4 ZIPCode 5000 non-null int64 5 Family 5000 non-null int64 6 CCAvg 5000 non-null float64 7 Education 5000 non-null category 8 Mortgage 5000 non-null int64 9 Personal_Loan 5000 non-null category 10 Securities_Account 5000 non-null category 11 CD_Account 5000 non-null category 12 Online 5000 non-null category 13 CreditCard 5000 non-null category dtypes: category(6), float64(1), int64(7) memory usage: 342.7 KB
#Checking unique values in education
df['Education'].unique()
[1, 2, 3] Categories (3, int64): [1, 2, 3]
#Replacing the numbers in the education column with the corresponding level of education.
# The column was converted to 'category' dtype above, so rename the categories
# in one shot. The original three chained `replace(..., inplace=True)` calls on
# a column selection are deprecated (pandas >= 2.2) and stop modifying `df`
# entirely under copy-on-write (pandas 3.x).
df["Education"] = df["Education"].cat.rename_categories(
    {1: "Undergraduate", 2: "Graduate", 3: "Professional"}
)
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| ID | 5000.0 | 2500.500000 | 1443.520003 | 1.0 | 1250.75 | 2500.5 | 3750.25 | 5000.0 |
| Age | 5000.0 | 45.338400 | 11.463166 | 23.0 | 35.00 | 45.0 | 55.00 | 67.0 |
| Experience | 5000.0 | 20.104600 | 11.467954 | -3.0 | 10.00 | 20.0 | 30.00 | 43.0 |
| Income | 5000.0 | 73.774200 | 46.033729 | 8.0 | 39.00 | 64.0 | 98.00 | 224.0 |
| ZIPCode | 5000.0 | 93169.257000 | 1759.455086 | 90005.0 | 91911.00 | 93437.0 | 94608.00 | 96651.0 |
| Family | 5000.0 | 2.396400 | 1.147663 | 1.0 | 1.00 | 2.0 | 3.00 | 4.0 |
| CCAvg | 5000.0 | 1.937938 | 1.747659 | 0.0 | 0.70 | 1.5 | 2.50 | 10.0 |
| Mortgage | 5000.0 | 56.498800 | 101.713802 | 0.0 | 0.00 | 0.0 | 101.00 | 635.0 |
# counting the number of missing values per row
df.isnull().sum(axis=1).value_counts()
0 5000 dtype: int64
df.isnull().sum().sort_values(ascending=False)
ID 0 Age 0 Experience 0 Income 0 ZIPCode 0 Family 0 CCAvg 0 Education 0 Mortgage 0 Personal_Loan 0 Securities_Account 0 CD_Account 0 Online 0 CreditCard 0 dtype: int64
df.duplicated().sum()
0
df.isnull().values.any() #checking for null values
False
#Looking at unique values
print(df.apply(lambda col: col.unique()))
ID [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... Age [25, 45, 39, 35, 37, 53, 50, 34, 65, 29, 48, 5... Experience [1, 19, 15, 9, 8, 13, 27, 24, 10, 39, 5, 23, 3... Income [49, 34, 11, 100, 45, 29, 72, 22, 81, 180, 105... ZIPCode [91107, 90089, 94720, 94112, 91330, 92121, 917... Family [4, 3, 1, 2] CCAvg [1.6, 1.5, 1.0, 2.7, 0.4, 0.3, 0.6, 8.9, 2.4, ... Education ['Undergraduate', 'Graduate', 'Professional'] ... Mortgage [0, 155, 104, 134, 111, 260, 163, 159, 97, 122... Personal_Loan [0, 1] Categories (2, int64): [0, 1] Securities_Account [1, 0] Categories (2, int64): [1, 0] CD_Account [0, 1] Categories (2, int64): [0, 1] Online [0, 1] Categories (2, int64): [0, 1] CreditCard [0, 1] Categories (2, int64): [0, 1] dtype: object
#Replacing negative values in experience with absolute values; in case of a data input error
df["Experience"] = abs(df["Experience"])
#Making sure no negative values remain in the experience column
df.loc[df["Experience"] < 0, "Experience"].unique()
array([], dtype=int64)
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| ID | 5000.0 | 2500.500000 | 1443.520003 | 1.0 | 1250.75 | 2500.5 | 3750.25 | 5000.0 |
| Age | 5000.0 | 45.338400 | 11.463166 | 23.0 | 35.00 | 45.0 | 55.00 | 67.0 |
| Experience | 5000.0 | 20.134600 | 11.415189 | 0.0 | 10.00 | 20.0 | 30.00 | 43.0 |
| Income | 5000.0 | 73.774200 | 46.033729 | 8.0 | 39.00 | 64.0 | 98.00 | 224.0 |
| ZIPCode | 5000.0 | 93169.257000 | 1759.455086 | 90005.0 | 91911.00 | 93437.0 | 94608.00 | 96651.0 |
| Family | 5000.0 | 2.396400 | 1.147663 | 1.0 | 1.00 | 2.0 | 3.00 | 4.0 |
| CCAvg | 5000.0 | 1.937938 | 1.747659 | 0.0 | 0.70 | 1.5 | 2.50 | 10.0 |
| Mortgage | 5000.0 | 56.498800 | 101.713802 | 0.0 | 0.00 | 0.0 | 101.00 | 635.0 |
#Looking at value counts
# Print the level frequencies of every categorical column, separated by a rule
for col in cat_cols:
    print("Unique values in", col, "are :")
    print(df[col].value_counts())
    print("*" * 50)
Unique values in Education are : Undergraduate 2096 Professional 1501 Graduate 1403 Name: Education, dtype: int64 ************************************************** Unique values in Personal_Loan are : 0 4520 1 480 Name: Personal_Loan, dtype: int64 ************************************************** Unique values in Securities_Account are : 0 4478 1 522 Name: Securities_Account, dtype: int64 ************************************************** Unique values in CD_Account are : 0 4698 1 302 Name: CD_Account, dtype: int64 ************************************************** Unique values in Online are : 1 2984 0 2016 Name: Online, dtype: int64 ************************************************** Unique values in CreditCard are : 0 3530 1 1470 Name: CreditCard, dtype: int64 **************************************************
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Draw a boxplot (top) and a histogram (bottom) of one column on a shared x-axis.

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to show the density curve (default False)
    bins: number of bins for histogram (default None)
    """
    # Two stacked axes: a slim boxplot above a larger histogram.
    fig, (box_ax, hist_ax) = plt.subplots(
        nrows=2,
        sharex=True,
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    # Boxplot; showmeans=True marks the column mean with a star.
    sns.boxplot(data=data, x=feature, ax=box_ax, showmeans=True, color="violet")
    # Histogram, with an explicit bin count only when the caller supplied one.
    if bins:
        sns.histplot(
            data=data, x=feature, kde=kde, ax=hist_ax, bins=bins, palette="winter"
        )
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=hist_ax)
    # Reference lines: dashed green = mean, solid black = median.
    hist_ax.axvline(data[feature].mean(), color="green", linestyle="--")
    hist_ax.axvline(data[feature].median(), color="black", linestyle="-")
# Univariate distribution of each main numeric column (boxplot + histogram)
histogram_boxplot(df, "Age")
histogram_boxplot(df, "Experience")
histogram_boxplot(df, "Income")
histogram_boxplot(df, "Mortgage")
histogram_boxplot(df, "Family")
# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with percentage at the top

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """
    total = len(data[feature])  # number of rows; denominator for percentages
    count = data[feature].nunique()
    # Scale the figure width with the number of categories actually shown.
    if n is None:
        plt.figure(figsize=(count + 1, 5))
    else:
        plt.figure(figsize=(n + 1, 5))
    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        # `if perc:` rather than `perc == True` — idiomatic truthiness test.
        if perc:
            # percentage of each class of the category
            label = "{:.1f}%".format(100 * p.get_height() / total)
        else:
            label = p.get_height()  # count of each level of the category
        x = p.get_x() + p.get_width() / 2  # horizontal centre of the bar
        y = p.get_height()  # top of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # annotate just above the bar
    plt.show()  # show the plot
labeled_barplot(df, "Education", perc=True)
#Below we see that most customers are a 1-person family, though there are no extreme differences between the family sizes.
labeled_barplot(df, "Family", perc=True)
# checking the number of uniques in the zip code
df["ZIPCode"].nunique()
467
#Minimizing the amount of unique values in the zip code column
# Keep only the first two digits of each ZIP code (its broad geographic region).
zip_prefix = df["ZIPCode"].astype(str).str[:2]
print(
    "Number of unique values if we take first two digits of ZIPCode: ",
    zip_prefix.nunique(),
)
df["ZIPCode"] = zip_prefix
Number of unique values if we take first two digits of ZIPCode: 7
labeled_barplot(df, "ZIPCode", perc=True)
plt.figure(figsize=(15, 7))
# numeric_only=True is required: df still contains non-numeric columns
# (the 'category' columns and the string ZIPCode), and since pandas 2.0
# DataFrame.corr() raises on non-numeric columns instead of dropping them.
sns.heatmap(
    df.corr(numeric_only=True), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Greys"
)
plt.show()
# Pairwise scatter plots coloured by the target class
sns.pairplot(data=df, hue="Personal_Loan")
plt.show()
### function to plot distributions wrt target
def distribution_plot_wrt_target(data, predictor, target):
    """
    Plot a predictor against a binary target: per-class density histograms
    (top row) and boxplots with and without outliers (bottom row).
    """
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    classes = data[target].unique()

    def _class_histogram(cls_value, ax, shade):
        # density histogram of the predictor restricted to one target class
        ax.set_title("Distribution of target for target=" + str(cls_value))
        sns.histplot(
            data=data[data[target] == cls_value],
            x=predictor,
            kde=True,
            ax=ax,
            color=shade,
            stat="density",
        )

    _class_histogram(classes[0], axes[0, 0], "teal")
    _class_histogram(classes[1], axes[0, 1], "orange")
    # boxplots of the predictor grouped by target class
    axs_box = axes[1, 0]
    axs_box.set_title("Boxplot w.r.t target")
    sns.boxplot(data=data, x=target, y=predictor, ax=axs_box, palette="gist_rainbow")
    axs_nofliers = axes[1, 1]
    axs_nofliers.set_title("Boxplot (without outliers) w.r.t target")
    sns.boxplot(
        data=data,
        x=target,
        y=predictor,
        ax=axs_nofliers,
        showfliers=False,
        palette="gist_rainbow",
    )
    plt.tight_layout()
    plt.show()
# Bivariate view of each numeric predictor against the loan target
distribution_plot_wrt_target(df, "Age", "Personal_Loan")
distribution_plot_wrt_target(df, "Income", "Personal_Loan")
distribution_plot_wrt_target(df, "Experience", "Personal_Loan")
distribution_plot_wrt_target(df, "CCAvg", "Personal_Loan")
distribution_plot_wrt_target(df, "Family", "Personal_Loan")
### Function to plot stacked bar charts for categorical columns
def stacked_plot(x):
    """
    Print a crosstab of `x` against the global df's Personal_Loan column and
    draw a stacked bar chart of the per-category loan shares.

    x: pandas Series aligned with `df` (a predictor column)
    """
    sns.set()
    ## crosstab
    # BUG FIX: the original sorted by a column named " >50K", which does not
    # exist in a Personal_Loan crosstab (columns are 0, 1 and "All") and
    # raised KeyError on every call. Sort by the margin total instead.
    tab1 = pd.crosstab(x, df["Personal_Loan"], margins=True).sort_values(
        by="All", ascending=False
    )
    print(tab1)
    print("-" * 120)
    ## visualising the cross tab
    # Row-normalised shares, sorted by the positive (loan=1) share — the
    # last crosstab column.
    tab = pd.crosstab(x, df["Personal_Loan"], normalize="index")
    tab = tab.sort_values(by=tab.columns[-1], ascending=False)
    tab.plot(kind="bar", stacked=True, figsize=(17, 7))
    # Single legend placed outside the axes (the original called plt.legend
    # twice; the first call was immediately overwritten).
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    # BUG FIX: the original referenced the module-level `df` in three places
    # instead of the `data` parameter, so the function silently ignored its
    # argument (and broke if called on any other dataframe). All references
    # now use `data`.
    count = data[predictor].nunique()
    # least frequent target class; sorting by it highlights where the rare
    # class concentrates
    sorter = data[target].value_counts().index[-1]
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 5, 5))
    # Single legend outside the axes (the original's first plt.legend call
    # was immediately overwritten by the second).
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
stacked_barplot(df, "Education", "Personal_Loan")
Personal_Loan 0 1 All Education All 4520 480 5000 Professional 1296 205 1501 Graduate 1221 182 1403 Undergraduate 2003 93 2096 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df, "Securities_Account", "Personal_Loan")
Personal_Loan 0 1 All Securities_Account All 4520 480 5000 0 4058 420 4478 1 462 60 522 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df, "CD_Account", "Personal_Loan")
Personal_Loan 0 1 All CD_Account All 4520 480 5000 0 4358 340 4698 1 162 140 302 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df, "Online", "Personal_Loan")
Personal_Loan 0 1 All Online All 4520 480 5000 1 2693 291 2984 0 1827 189 2016 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df, "CreditCard", "Personal_Loan")
Personal_Loan 0 1 All CreditCard All 4520 480 5000 0 3193 337 3530 1 1327 143 1470 ------------------------------------------------------------------------------------------------------------------------
# Boxplots of every numeric column on one figure, to eyeball outliers
numerical_col = df.select_dtypes(include=np.number).columns.tolist()
plt.figure(figsize=(20, 30))
for idx, col_name in enumerate(numerical_col, start=1):
    plt.subplot(5, 4, idx)
    plt.boxplot(df[col_name], whis=1.5)
    plt.tight_layout()
    plt.title(col_name)
plt.show()
# functions to treat outliers by flooring and capping
def treat_outliers(df, col):
    """
    Cap one column at the Tukey whiskers (Q1 - 1.5*IQR, Q3 + 1.5*IQR).

    df: dataframe (the column is modified in place)
    col: dataframe column
    """
    q1, q3 = df[col].quantile([0.25, 0.75])  # 25th and 75th quantiles
    iqr = q3 - q1
    lower_whisker = q1 - 1.5 * iqr
    upper_whisker = q3 + 1.5 * iqr
    # values below the lower whisker are floored to it; values above the
    # upper whisker are capped to it
    df[col] = np.clip(df[col], lower_whisker, upper_whisker)
    return df


def treat_outliers_all(df, col_list):
    """
    Apply treat_outliers to every column in col_list.

    df: dataframe
    col_list: list of dataframe columns
    """
    for column in col_list:
        df = treat_outliers(df, column)
    return df
# Cap outliers in every numeric column, then re-plot to confirm treatment
numerical_col = df.select_dtypes(include=np.number).columns.tolist()
df = treat_outliers_all(df, numerical_col)
# let's look at box plot to see if outliers have been treated or not
plt.figure(figsize=(20, 30))
for idx, col_name in enumerate(numerical_col, start=1):
    plt.subplot(5, 4, idx)
    plt.boxplot(df[col_name], whis=1.5)
    plt.tight_layout()
    plt.title(col_name)
plt.show()
#Creating training and test sets
Y = df["Personal_Loan"]
# Drop the target and the row identifier, then one-hot encode the
# categorical predictors (drop_first avoids redundant dummy columns).
X = pd.get_dummies(df.drop(["Personal_Loan", "ID"], axis=1), drop_first=True)
# Splitting data in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
#Showing shape of training and test sets
print("Shape of Training set : ", X_train.shape)
print("Shape of test set : ", X_test.shape)
print("Percentage of classes in training set:")
print(y_train.value_counts(normalize=True))
print("Percentage of classes in test set:")
print(y_test.value_counts(normalize=True))
Shape of Training set : (3500, 18) Shape of test set : (1500, 18) Percentage of classes in training set: 0 0.905429 1 0.094571 Name: Personal_Loan, dtype: float64 Percentage of classes in test set: 0 0.900667 1 0.099333 Name: Personal_Loan, dtype: float64
# defining a function to compute different metrics to check performance of a classification model built using statsmodels
def model_performance_classification_statsmodels(
    model, predictors, target, threshold=0.5
):
    """
    Compute accuracy, recall, precision and F1 for a statsmodels classifier.

    model: classifier
    predictors: independent variables
    target: dependent variable
    threshold: threshold for classifying the observation as class 1
    """
    # probabilities above the threshold become class 1, then rounded to 0/1
    predicted_classes = np.round(model.predict(predictors) > threshold)
    # one row holding the four scores, in fixed column order
    scores = {
        "Accuracy": accuracy_score(target, predicted_classes),
        "Recall": recall_score(target, predicted_classes),
        "Precision": precision_score(target, predicted_classes),
        "F1": f1_score(target, predicted_classes),
    }
    return pd.DataFrame(scores, index=[0])
# defining a function to plot the confusion_matrix of a classification model
def confusion_matrix_statsmodels(model, predictors, target, threshold=0.5):
    """
    To plot the confusion_matrix with percentages
    model: classifier
    predictors: independent variables
    target: dependent variable
    threshold: threshold for classifying the observation as class 1
    """
    predicted = model.predict(predictors) > threshold
    cm = confusion_matrix(target, predicted)
    total = cm.flatten().sum()
    # each cell is annotated with its raw count and its share of all rows
    cell_text = [
        ["{0:0.0f}".format(v) + "\n{0:.2%}".format(v / total)]
        for v in cm.flatten()
    ]
    labels = np.asarray(cell_text).reshape(2, 2)  # binary target -> 2x2 grid
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
#Logistic Regression with SKlearn
lg = LogisticRegression(solver="newton-cg", random_state=1)
model = lg.fit(X_train, y_train)
# predicting on training set
y_pred_train = lg.predict(X_train)
print("Training set performance:")
# print the four classification scores; output matches the original line-for-line
for metric_label, scorer in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
):
    print(metric_label, scorer(y_train, y_pred_train))
Training set performance: Accuracy: 0.9605714285714285 Precision: 0.8784313725490196 Recall: 0.676737160120846 F1: 0.7645051194539249
# predicting on the test set
y_pred_test = lg.predict(X_test)
print("Test set performance:")
# same four scores as the training report, on held-out data
for metric_label, scorer in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
):
    print(metric_label, scorer(y_test, y_pred_test))
Test set performance: Accuracy: 0.956 Precision: 0.8807339449541285 Recall: 0.6442953020134228 F1: 0.7441860465116278
# Rebuild the design matrix for statsmodels: drop target and row id,
# one-hot encode, and add an explicit intercept column.
Y = df["Personal_Loan"]
X = pd.get_dummies(df.drop(["Personal_Loan", "ID"], axis=1), drop_first=True)
# adding constant
X = sm.add_constant(X)
# Splitting data in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
# fitting logistic regression model (disp=False suppresses convergence output)
logit = sm.Logit(y_train, X_train.astype(float))
lg = logit.fit(disp=False)
print(lg.summary())
Logit Regression Results
==============================================================================
Dep. Variable: Personal_Loan No. Observations: 3500
Model: Logit Df Residuals: 3481
Method: MLE Df Model: 18
Date: Sat, 31 Jul 2021 Pseudo R-squ.: 0.6624
Time: 02:32:58 Log-Likelihood: -369.87
converged: True LL-Null: -1095.5
Covariance Type: nonrobust LLR p-value: 1.463e-297
==========================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------------
const -14.5530 2.190 -6.644 0.000 -18.846 -10.260
Age -0.0016 0.079 -0.020 0.984 -0.157 0.153
Experience 0.0093 0.079 0.118 0.906 -0.146 0.164
Income 0.0623 0.004 15.799 0.000 0.055 0.070
Family 0.7001 0.101 6.936 0.000 0.502 0.898
CCAvg 0.4554 0.073 6.210 0.000 0.312 0.599
Mortgage 0.0014 0.001 1.457 0.145 -0.000 0.003
ZIPCode_91 -0.2221 0.376 -0.591 0.555 -0.959 0.515
ZIPCode_92 0.1721 0.344 0.501 0.616 -0.501 0.845
ZIPCode_93 0.1246 0.424 0.294 0.769 -0.706 0.955
ZIPCode_94 -0.1017 0.320 -0.318 0.751 -0.730 0.526
ZIPCode_95 -0.0944 0.358 -0.263 0.792 -0.797 0.608
ZIPCode_96 -3.4974 7.250 -0.482 0.630 -17.707 10.712
Education_Graduate 4.1339 0.344 12.028 0.000 3.460 4.807
Education_Professional 4.2020 0.337 12.456 0.000 3.541 4.863
Securities_Account_1 -1.1192 0.417 -2.687 0.007 -1.936 -0.303
CD_Account_1 3.8585 0.447 8.640 0.000 2.983 4.734
Online_1 -0.6888 0.208 -3.317 0.001 -1.096 -0.282
CreditCard_1 -1.0907 0.270 -4.046 0.000 -1.619 -0.562
==========================================================================================
Possibly complete quasi-separation: A fraction 0.15 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.
print("Training performance:")
model_performance_classification_statsmodels(lg, X_train, y_train)
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.964571 | 0.716012 | 0.88764 | 0.792642 |
print("Test performance:")
model_performance_classification_statsmodels(lg, X_test, y_test)
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.958 | 0.671141 | 0.877193 | 0.760456 |
*Positive values of the coefficient shows that the probability of customer getting a Personal Loan increases with the increase of corresponding attribute value.
*Negative values of the coefficient show that the probability of customer getting a Personal Loan decreases with the increase of corresponding attribute value.
*p-value of a variable indicates if the variable is significant or not. If we consider the significance level to be 0.05 (5%), then any variable with a p-value less than 0.05 would be considered significant.
*But these variables might contain multicollinearity, which will affect the p-values.
*We will have to remove multicollinearity from the data to get reliable coefficients and p-values.
*There are different ways of detecting (or testing) multi-collinearity, one such way is the Variation Inflation Factor.
#Code to check for Multicollinearity
# One VIF per design-matrix column, indexed by column name
vif_series = pd.Series(
    data=[
        variance_inflation_factor(X_train.values, col_idx)
        for col_idx in range(X_train.shape[1])
    ],
    index=X_train.columns,
    dtype=float,
)
print("Series before feature selection: \n\n{}\n".format(vif_series))
Series before feature selection: const 454.068364 Age 91.567908 Experience 91.513993 Income 1.825494 Family 1.047016 CCAvg 1.708615 Mortgage 1.020947 ZIPCode_91 1.624618 ZIPCode_92 1.928396 ZIPCode_93 1.453881 ZIPCode_94 2.225582 ZIPCode_95 1.819881 ZIPCode_96 1.045006 Education_Graduate 1.293715 Education_Professional 1.336017 Securities_Account_1 1.144960 CD_Account_1 1.359284 Online_1 1.042024 CreditCard_1 1.110630 dtype: float64
# Age and Experience showed VIFs above 90 — drop Age and recompute the VIFs
X_train1 = X_train.drop("Age", axis=1)
vif_series2 = pd.Series(
    data=[
        variance_inflation_factor(X_train1.values, col_idx)
        for col_idx in range(X_train1.shape[1])
    ],
    index=X_train1.columns,
)
print("Series before feature selection: \n\n{}\n".format(vif_series2))
Series before feature selection: const 24.329131 Experience 1.012850 Income 1.820306 Family 1.046658 CCAvg 1.703620 Mortgage 1.020926 ZIPCode_91 1.624332 ZIPCode_92 1.927172 ZIPCode_93 1.453844 ZIPCode_94 2.225575 ZIPCode_95 1.819785 ZIPCode_96 1.044701 Education_Graduate 1.280624 Education_Professional 1.259606 Securities_Account_1 1.144780 CD_Account_1 1.358857 Online_1 1.041917 CreditCard_1 1.110582 dtype: float64
# Refit the logit model without the collinear Age column
logit1 = sm.Logit(y_train, X_train1.astype(float))
lg1 = logit1.fit()
print("Training performance:")
log_reg_model_train_perf = model_performance_classification_statsmodels(
    lg1, X_train1, y_train
)
# display the one-row metrics dataframe (notebook-style bare expression)
log_reg_model_train_perf
Optimization terminated successfully.
Current function value: 0.105677
Iterations 11
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.964571 | 0.716012 | 0.88764 | 0.792642 |
print(lg1.summary())
Logit Regression Results
==============================================================================
Dep. Variable: Personal_Loan No. Observations: 3500
Model: Logit Df Residuals: 3482
Method: MLE Df Model: 17
Date: Sat, 31 Jul 2021 Pseudo R-squ.: 0.6624
Time: 02:33:07 Log-Likelihood: -369.87
converged: True LL-Null: -1095.5
Covariance Type: nonrobust LLR p-value: 1.559e-298
==========================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------------
const -14.5926 0.847 -17.237 0.000 -16.252 -12.933
Experience 0.0078 0.008 0.918 0.359 -0.009 0.024
Income 0.0623 0.004 15.876 0.000 0.055 0.070
Family 0.7001 0.101 6.936 0.000 0.502 0.898
CCAvg 0.4553 0.073 6.215 0.000 0.312 0.599
Mortgage 0.0014 0.001 1.457 0.145 -0.000 0.003
ZIPCode_91 -0.2223 0.376 -0.591 0.554 -0.959 0.515
ZIPCode_92 0.1718 0.343 0.501 0.617 -0.501 0.845
ZIPCode_93 0.1243 0.423 0.294 0.769 -0.705 0.954
ZIPCode_94 -0.1019 0.320 -0.318 0.750 -0.730 0.526
ZIPCode_95 -0.0947 0.358 -0.264 0.791 -0.796 0.607
ZIPCode_96 -3.4976 7.249 -0.482 0.629 -17.706 10.710
Education_Graduate 4.1336 0.344 12.033 0.000 3.460 4.807
Education_Professional 4.2011 0.334 12.568 0.000 3.546 4.856
Securities_Account_1 -1.1190 0.416 -2.687 0.007 -1.935 -0.303
CD_Account_1 3.8586 0.447 8.640 0.000 2.983 4.734
Online_1 -0.6887 0.208 -3.318 0.001 -1.096 -0.282
CreditCard_1 -1.0905 0.269 -4.048 0.000 -1.618 -0.563
==========================================================================================
Possibly complete quasi-separation: A fraction 0.15 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.
# Drop all ZIP-code dummies — every one had a p-value far above 0.05 — and refit
X_train2 = X_train1.drop(
    ["ZIPCode_91","ZIPCode_92","ZIPCode_93","ZIPCode_94","ZIPCode_95","ZIPCode_96"], axis=1
)
logit2 = sm.Logit(y_train, X_train2.astype(float))
lg2 = logit2.fit()
print(lg2.summary())
Optimization terminated successfully.
Current function value: 0.106011
Iterations 10
Logit Regression Results
==============================================================================
Dep. Variable: Personal_Loan No. Observations: 3500
Model: Logit Df Residuals: 3488
Method: MLE Df Model: 11
Date: Sat, 31 Jul 2021 Pseudo R-squ.: 0.6613
Time: 02:33:09 Log-Likelihood: -371.04
converged: True LL-Null: -1095.5
Covariance Type: nonrobust LLR p-value: 3.476e-304
==========================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------------
const -14.5898 0.819 -17.825 0.000 -16.194 -12.986
Experience 0.0080 0.008 0.950 0.342 -0.009 0.025
Income 0.0620 0.004 15.936 0.000 0.054 0.070
Family 0.7018 0.101 6.976 0.000 0.505 0.899
CCAvg 0.4576 0.073 6.284 0.000 0.315 0.600
Mortgage 0.0015 0.001 1.502 0.133 -0.000 0.003
Education_Graduate 4.0943 0.340 12.048 0.000 3.428 4.760
Education_Professional 4.1945 0.333 12.577 0.000 3.541 4.848
Securities_Account_1 -1.1120 0.414 -2.685 0.007 -1.924 -0.300
CD_Account_1 3.8186 0.442 8.642 0.000 2.953 4.685
Online_1 -0.6946 0.207 -3.358 0.001 -1.100 -0.289
CreditCard_1 -1.0735 0.269 -3.996 0.000 -1.600 -0.547
==========================================================================================
Possibly complete quasi-separation: A fraction 0.14 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.
# Training-set metrics for lg2 (model without the ZIP code dummies).
print("Training performance:")
model_performance_classification_statsmodels(lg2, X_train2, y_train)
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.964 | 0.716012 | 0.881041 | 0.79 |
# Mortgage is insignificant in lg2 (p ~ 0.13), so drop it and refit.
X_train3 = X_train2.drop(columns=["Mortgage"])
logit3 = sm.Logit(y_train, X_train3.astype(float))
lg3 = logit3.fit()
print(lg3.summary())
Optimization terminated successfully.
Current function value: 0.106329
Iterations 10
Logit Regression Results
==============================================================================
Dep. Variable: Personal_Loan No. Observations: 3500
Model: Logit Df Residuals: 3489
Method: MLE Df Model: 10
Date: Sat, 31 Jul 2021 Pseudo R-squ.: 0.6603
Time: 02:33:11 Log-Likelihood: -372.15
converged: True LL-Null: -1095.5
Covariance Type: nonrobust LLR p-value: 8.507e-305
==========================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------------
const -14.4761 0.811 -17.860 0.000 -16.065 -12.887
Experience 0.0071 0.008 0.839 0.401 -0.009 0.024
Income 0.0621 0.004 16.007 0.000 0.055 0.070
Family 0.7023 0.101 6.973 0.000 0.505 0.900
CCAvg 0.4514 0.073 6.218 0.000 0.309 0.594
Education_Graduate 4.0734 0.339 12.014 0.000 3.409 4.738
Education_Professional 4.1791 0.333 12.560 0.000 3.527 4.831
Securities_Account_1 -1.0885 0.412 -2.640 0.008 -1.897 -0.280
CD_Account_1 3.8149 0.442 8.640 0.000 2.950 4.680
Online_1 -0.6890 0.207 -3.335 0.001 -1.094 -0.284
CreditCard_1 -1.0710 0.268 -4.001 0.000 -1.596 -0.546
==========================================================================================
Possibly complete quasi-separation: A fraction 0.14 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.
# Training-set metrics for lg3 (after dropping Mortgage).
print("Training performance:")
model_performance_classification_statsmodels(lg3, X_train3, y_train)
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.965429 | 0.725076 | 0.888889 | 0.798669 |
# Experience still has p > 0.05 in lg3, so remove it as well and refit;
# lg4 is the final reduced logistic regression model.
X_train4 = X_train3.drop(columns=["Experience"])
logit4 = sm.Logit(y_train, X_train4.astype(float))
lg4 = logit4.fit()
print(lg4.summary())
Optimization terminated successfully.
Current function value: 0.106429
Iterations 10
Logit Regression Results
==============================================================================
Dep. Variable: Personal_Loan No. Observations: 3500
Model: Logit Df Residuals: 3490
Method: MLE Df Model: 9
Date: Sat, 31 Jul 2021 Pseudo R-squ.: 0.6600
Time: 02:33:14 Log-Likelihood: -372.50
converged: True LL-Null: -1095.5
Covariance Type: nonrobust LLR p-value: 9.265e-306
==========================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------------
const -14.2962 0.777 -18.395 0.000 -15.819 -12.773
Income 0.0620 0.004 16.021 0.000 0.054 0.070
Family 0.6992 0.101 6.951 0.000 0.502 0.896
CCAvg 0.4457 0.072 6.158 0.000 0.304 0.588
Education_Graduate 4.0691 0.339 12.007 0.000 3.405 4.733
Education_Professional 4.1664 0.332 12.554 0.000 3.516 4.817
Securities_Account_1 -1.0882 0.413 -2.633 0.008 -1.898 -0.278
CD_Account_1 3.8308 0.442 8.669 0.000 2.965 4.697
Online_1 -0.6911 0.206 -3.348 0.001 -1.096 -0.287
CreditCard_1 -1.0718 0.268 -4.006 0.000 -1.596 -0.547
==========================================================================================
Possibly complete quasi-separation: A fraction 0.14 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.
# Looking at performance after all variables with a p-value greater than .05 have been dropped.
print("Training performance:")
model_performance_classification_statsmodels(lg4, X_train4, y_train)
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.963714 | 0.719033 | 0.875 | 0.789386 |
# NOTE(review): this evaluates `lg` (the earlier full model) on the
# unreduced X_test, not the final reduced model `lg4` — confirm this is the
# intended comparison; lg4's test performance is computed further below
# on X_test2.
print("Test performance:")
model_performance_classification_statsmodels(lg, X_test, y_test)
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.958 | 0.671141 | 0.877193 | 0.760456 |
# Confusion matrix for lg4 on the training data (default 0.5 threshold).
confusion_matrix_statsmodels(lg4, X_train4, y_train)
# ROC curve and AUC for lg4 on the training data; predict once and reuse.
train_scores = lg4.predict(X_train4)
logit_roc_auc_train = roc_auc_score(y_train, train_scores)
fpr, tpr, thresholds = roc_curve(y_train, train_scores)
plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, label="Logistic Regression (area = %0.2f)" % logit_roc_auc_train)
plt.plot([0, 1], [0, 1], "r--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic")
plt.legend(loc="lower right")
plt.show()
# Optimal threshold from the ROC curve: the point maximizing tpr - fpr
# (Youden's J), i.e. high true-positive rate with low false-positive rate.
fpr, tpr, thresholds = roc_curve(y_train, lg4.predict(X_train4))
best_idx = np.argmax(tpr - fpr)
optimal_threshold_auc_roc = thresholds[best_idx]
print(optimal_threshold_auc_roc)
0.07995881985950803
# creating confusion matrix at the ROC-optimal threshold (~0.08, printed above)
confusion_matrix_statsmodels(
    lg4, X_train4, y_train, threshold=optimal_threshold_auc_roc
)
# checking model performance for this model
log_reg_model_train_perf_threshold_auc_roc = (
    model_performance_classification_statsmodels(
        lg4, X_train4, y_train, threshold=optimal_threshold_auc_roc
    )
)
print("Training performance:")
log_reg_model_train_perf_threshold_auc_roc
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.897714 | 0.912387 | 0.478605 | 0.627859 |
# Precision/recall trade-off across all thresholds on the training data.
y_scores = lg4.predict(X_train4)
prec, rec, tre = precision_recall_curve(y_train, y_scores)
def plot_prec_recall_vs_tresh(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold.

    precisions and recalls have one more entry than thresholds
    (sklearn convention), so the last element of each is dropped
    to align the arrays.
    """
    plt.plot(thresholds, precisions[:-1], "b--", label="precision")
    plt.plot(thresholds, recalls[:-1], "g--", label="recall")
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])
# Draw the precision/recall vs. threshold chart for lg4's training scores.
plt.figure(figsize=(10, 7))
plot_prec_recall_vs_tresh(prec, rec, tre)
plt.show()
# setting the threshold (read off the precision-recall chart above, near
# where the precision and recall curves cross)
optimal_threshold_curve = 0.32
# creating confusion matrix
confusion_matrix_statsmodels(lg4, X_train4, y_train, threshold=optimal_threshold_curve)
log_reg_model_train_perf_threshold_curve = model_performance_classification_statsmodels(
    lg4, X_train4, y_train, threshold=optimal_threshold_curve
)
print("Training performance:")
log_reg_model_train_perf_threshold_curve
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.957714 | 0.797583 | 0.765217 | 0.781065 |
# Align the test set with the columns kept in the final model, then
# evaluate lg4 on it at the default 0.5 threshold.
X_test2 = X_test[list(X_train4.columns)]
confusion_matrix_statsmodels(lg4, X_test2, y_test)
log_reg_model_test_perf = model_performance_classification_statsmodels(
    lg4, X_test2, y_test
)
print("Test performance:")
log_reg_model_test_perf
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.956667 | 0.66443 | 0.868421 | 0.752852 |
# ROC curve and AUC for lg4 on the *test* data.
# Fix: the AUC variable was previously named `logit_roc_auc_train`, which
# was misleading for a test-set metric; the prediction is also hoisted so
# the model is scored once instead of twice.
test_scores = lg4.predict(X_test2)
logit_roc_auc_test = roc_auc_score(y_test, test_scores)
fpr, tpr, thresholds = roc_curve(y_test, test_scores)
plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, label="Logistic Regression (area = %0.2f)" % logit_roc_auc_test)
plt.plot([0, 1], [0, 1], "r--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic")
plt.legend(loc="lower right")
plt.show()
# creating confusion matrix on the test set at the ROC-optimal threshold
confusion_matrix_statsmodels(lg4, X_test2, y_test, threshold=optimal_threshold_auc_roc)
# checking model performance for this model
log_reg_model_test_perf_threshold_auc_roc = (
    model_performance_classification_statsmodels(
        lg4, X_test2, y_test, threshold=optimal_threshold_auc_roc
    )
)
print("Test performance:")
log_reg_model_test_perf_threshold_auc_roc
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.901333 | 0.892617 | 0.501887 | 0.642512 |
# creating confusion matrix on the test set at the precision-recall
# threshold (0.32, chosen from the training curve above)
confusion_matrix_statsmodels(lg4, X_test2, y_test, threshold=optimal_threshold_curve)
log_reg_model_test_perf_threshold_curve = model_performance_classification_statsmodels(
    lg4, X_test2, y_test, threshold=optimal_threshold_curve
)
print("Test performance:")
log_reg_model_test_perf_threshold_curve
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.953333 | 0.738255 | 0.780142 | 0.758621 |
# training performance comparison
models_train_comp_df = pd.concat(
    [
        log_reg_model_train_perf.T,
        log_reg_model_train_perf_threshold_auc_roc.T,
        log_reg_model_train_perf_threshold_curve.T,
    ],
    axis=1,
)
# Fix: label the columns with the models actually compared — the
# statsmodels logit (lg4) at the default 0.5 threshold, at the
# ROC-optimal threshold (~0.08 as computed above), and at the
# precision-recall threshold (0.32). The previous labels
# ("sklearn", "0.13 Threshold") did not match the models evaluated.
models_train_comp_df.columns = [
    "Logistic Regression-default Threshold",
    "Logistic Regression-0.08 Threshold",
    "Logistic Regression-0.32 Threshold",
]
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| Logistic Regression sklearn | Logistic Regression-0.13 Threshold | Logistic Regression-0.32 Threshold | |
|---|---|---|---|
| Accuracy | 0.964571 | 0.897714 | 0.957714 |
| Recall | 0.716012 | 0.912387 | 0.797583 |
| Precision | 0.887640 | 0.478605 | 0.765217 |
| F1 | 0.792642 | 0.627859 | 0.781065 |
# testing performance comparison
models_test_comp_df = pd.concat(
    [
        log_reg_model_test_perf.T,
        log_reg_model_test_perf_threshold_auc_roc.T,
        log_reg_model_test_perf_threshold_curve.T,
    ],
    axis=1,
)
# Fix: the test set was scored with the same thresholds as the training
# set — default 0.5, ROC-optimal (~0.08), and 0.32 — so label the columns
# accordingly. The previous labels ("0.11", "0.25") did not match any
# threshold used in this notebook.
models_test_comp_df.columns = [
    "Logistic Regression-default Threshold",
    "Logistic Regression-0.08 Threshold",
    "Logistic Regression-0.32 Threshold",
]
print("Test set performance comparison:")
models_test_comp_df
Test set performance comparison:
| Logistic Regression sklearn | Logistic Regression-0.11 Threshold | Logistic Regression-0.25 Threshold | |
|---|---|---|---|
| Accuracy | 0.956667 | 0.901333 | 0.953333 |
| Recall | 0.664430 | 0.892617 | 0.738255 |
| Precision | 0.868421 | 0.501887 | 0.780142 |
| F1 | 0.752852 | 0.642512 | 0.758621 |
# Helper to summarize a fitted sklearn classifier's performance in one row.
def model_performance_classification_sklearn(model, predictors, target):
    """
    Function to compute different metrics to check classification model performance

    model: classifier
    predictors: independent variables
    target: dependent variable

    Returns a one-row DataFrame with Accuracy, Recall, Precision and F1.
    """
    # predicting using the independent variables
    pred = model.predict(predictors)
    # assemble all four scores into a single-row dataframe
    scores = {
        "Accuracy": accuracy_score(target, pred),
        "Recall": recall_score(target, pred),
        "Precision": precision_score(target, pred),
        "F1": f1_score(target, pred),
    }
    return pd.DataFrame(scores, index=[0])
def confusion_matrix_sklearn(model, predictors, target):
    """
    To plot the confusion_matrix with percentages

    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    cm = confusion_matrix(target, model.predict(predictors))
    total = cm.flatten().sum()
    # annotate each cell with "count\npercent-of-all-samples"
    annot = np.asarray(
        ["{0:0.0f}".format(v) + "\n{0:.2%}".format(v / total) for v in cm.flatten()]
    ).reshape(2, 2)
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=annot, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
def make_confusion_matrix(model, y_actual, labels=(1, 0), predictors=None):
    '''
    Plot the confusion matrix (counts and percentages) for `model`.

    model : classifier used to generate predictions
    y_actual : ground-truth labels
    labels : kept for backward compatibility; unused — the matrix is
        always computed with label order [0, 1] (the original behavior).
        The default is now a tuple to avoid a mutable default argument.
    predictors : features to predict on. Fix: the original silently used
        the notebook-global X_test regardless of its docstring; that
        fallback is preserved as the default so existing calls behave
        identically, but callers can now pass the features explicitly.
    '''
    if predictors is None:
        predictors = X_test  # NOTE(review): notebook-global fallback
    y_predict = model.predict(predictors)
    cm = metrics.confusion_matrix(y_actual, y_predict, labels=[0, 1])
    df_cm = pd.DataFrame(
        cm,
        index=["Actual - No", "Actual - Yes"],
        columns=["Predicted - No", "Predicted - Yes"],
    )
    # annotate each cell with "count\npercent-of-all-samples"
    group_counts = ["{0:0.0f}".format(v) for v in cm.flatten()]
    group_percentages = ["{0:.2%}".format(v) for v in cm.flatten() / np.sum(cm)]
    cell_labels = np.asarray(
        [f"{c}\n{p}" for c, p in zip(group_counts, group_percentages)]
    ).reshape(2, 2)
    plt.figure(figsize=(10, 7))
    sns.heatmap(df_cm, annot=cell_labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
# Baseline decision tree; class weights (0.15 / 0.85) counter the heavy
# class imbalance by penalizing missed positives more than missed negatives.
model = DecisionTreeClassifier(
    criterion="gini", class_weight={0: 0.15, 1: 0.85}, random_state=1
)
model.fit(X_train, y_train)
DecisionTreeClassifier(class_weight={0: 0.15, 1: 0.85}, random_state=1)
# NOTE(review): confusion_matrix_statsmodels is the statsmodels helper;
# applied to a fitted sklearn tree it will call model.predict, which
# returns hard class labels rather than probabilities — confirm any
# thresholding inside the helper is a no-op here.
confusion_matrix_statsmodels(model, X_train, y_train)
#Checking test performance
make_confusion_matrix(model,y_test)
# Training metrics for the unpruned tree (perfect scores below = overfit).
decision_tree_perf_train = model_performance_classification_sklearn(
    model, X_train, y_train
)
decision_tree_perf_train
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Class balance of the training target as proportions (normalize=True).
y_train.value_counts(1)
0 0.905429 1 0.094571 Name: Personal_Loan, dtype: float64
# Column names reused by the tree plots and text reports below.
feature_names = X_train.columns.tolist()
print(feature_names)
['const', 'Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Mortgage', 'ZIPCode_91', 'ZIPCode_92', 'ZIPCode_93', 'ZIPCode_94', 'ZIPCode_95', 'ZIPCode_96', 'Education_Graduate', 'Education_Professional', 'Securities_Account_1', 'CD_Account_1', 'Online_1', 'CreditCard_1']
Insights:
True Positives:
*Reality: A customer got a Personal Loan
*Model predicted: A customer would get a Personal Loan
*Outcome: The model is good.
True Negatives:
*Reality: A customer did not get a Personal Loan
*Model predicted: The customer would not get a Personal Loan
*Outcome: The business is unaffected.
False Positives:
*Reality: A customer did not get a Personal Loan
*Model predicted: The customer would get a Personal Loan
*Outcome: The team which is targeting the potential customers will be wasting their resources on the people/customers who will not be contributing to the revenue.
False Negatives:
*Reality: A customer got a Personal Loan.
*Model predicted: The customer would not get a Personal Loan
*Outcome: The customer contributed to revenue but was not targeted by ads or advertisement. This results in a loss of revenue; if these customers were targeted, the bank could have sold more Personal Loans.
## Function to calculate recall score
def get_recall_score(model):
    '''
    Print recall on the train and test sets for `model`.

    model : fitted classifier; predictions are made on the
    notebook-global X_train / X_test against y_train / y_test.
    '''
    train_recall = metrics.recall_score(y_train, model.predict(X_train))
    test_recall = metrics.recall_score(y_test, model.predict(X_test))
    print("Recall on training set : ", train_recall)
    print("Recall on test set : ", test_recall)
get_recall_score(model)
Recall on training set : 1.0 Recall on test set : 0.8456375838926175
#Visualizing Decision Tree
plt.figure(figsize=(20, 30))
out = tree.plot_tree(
    model,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
# below code will add arrows to the decision tree split if they are missing
for o in out:
    arrow = o.arrow_patch
    if arrow is not None:
        arrow.set_edgecolor("black")
        arrow.set_linewidth(1)
plt.show()
# Text report showing the rules of a decision tree -
print(tree.export_text(model, feature_names=feature_names, show_weights=True))
|--- Income <= 98.50 | |--- CCAvg <= 2.95 | | |--- weights: [374.10, 0.00] class: 0 | |--- CCAvg > 2.95 | | |--- CD_Account_1 <= 0.50 | | | |--- CCAvg <= 3.95 | | | | |--- Income <= 81.50 | | | | | |--- Age <= 36.50 | | | | | | |--- Family <= 3.50 | | | | | | | |--- CCAvg <= 3.50 | | | | | | | | |--- Experience <= 9.50 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | |--- Experience > 9.50 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | |--- CCAvg > 3.50 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- Family > 3.50 | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | | |--- Age > 36.50 | | | | | | |--- ZIPCode_91 <= 0.50 | | | | | | | |--- Education_Professional <= 0.50 | | | | | | | | |--- weights: [4.20, 0.00] class: 0 | | | | | | | |--- Education_Professional > 0.50 | | | | | | | | |--- weights: [1.95, 0.00] class: 0 | | | | | | |--- ZIPCode_91 > 0.50 | | | | | | | |--- Education_Professional <= 0.50 | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | |--- Education_Professional > 0.50 | | | | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | |--- Income > 81.50 | | | | | |--- Mortgage <= 152.00 | | | | | | |--- Securities_Account_1 <= 0.50 | | | | | | | |--- CCAvg <= 3.05 | | | | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | | | | |--- CCAvg > 3.05 | | | | | | | | |--- CCAvg <= 3.85 | | | | | | | | | |--- Income <= 93.50 | | | | | | | | | | |--- Age <= 45.50 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | | | | | |--- Age > 45.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | |--- Income > 93.50 | | | | | | | | | | |--- CreditCard_1 <= 0.50 | | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | | | |--- CreditCard_1 > 0.50 | | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | |--- CCAvg > 3.85 | | | | | | | | | |--- weights: [0.00, 2.55] class: 1 | | | | | | |--- Securities_Account_1 > 0.50 
| | | | | | | |--- Age <= 37.00 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | |--- Age > 37.00 | | | | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | | |--- Mortgage > 152.00 | | | | | | |--- Mortgage <= 168.00 | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- Mortgage > 168.00 | | | | | | | |--- weights: [0.90, 0.00] class: 0 | | | |--- CCAvg > 3.95 | | | | |--- weights: [6.75, 0.00] class: 0 | | |--- CD_Account_1 > 0.50 | | | |--- ZIPCode_91 <= 0.50 | | | | |--- weights: [0.00, 6.80] class: 1 | | | |--- ZIPCode_91 > 0.50 | | | | |--- weights: [0.15, 0.00] class: 0 |--- Income > 98.50 | |--- Family <= 2.50 | | |--- Education_Professional <= 0.50 | | | |--- Education_Graduate <= 0.50 | | | | |--- Income <= 100.00 | | | | | |--- CCAvg <= 4.20 | | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | | |--- CCAvg > 4.20 | | | | | | |--- Securities_Account_1 <= 0.50 | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | |--- Securities_Account_1 > 0.50 | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | |--- Income > 100.00 | | | | | |--- Income <= 103.50 | | | | | | |--- Securities_Account_1 <= 0.50 | | | | | | | |--- weights: [2.10, 0.00] class: 0 | | | | | | |--- Securities_Account_1 > 0.50 | | | | | | | |--- CreditCard_1 <= 0.50 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | |--- CreditCard_1 > 0.50 | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | |--- Income > 103.50 | | | | | | |--- ZIPCode_95 <= 0.50 | | | | | | | |--- weights: [55.65, 0.00] class: 0 | | | | | | |--- ZIPCode_95 > 0.50 | | | | | | | |--- weights: [9.30, 0.00] class: 0 | | | |--- Education_Graduate > 0.50 | | | | |--- Income <= 110.00 | | | | | |--- weights: [1.80, 0.00] class: 0 | | | | |--- Income > 110.00 | | | | | |--- Income <= 116.50 | | | | | | |--- Mortgage <= 126.25 | | | | | | | |--- Age <= 48.50 | | | | | | | | |--- Income <= 114.50 | | | | | | | | | |--- Income <= 113.00 | | | | | | | | | | |--- 
weights: [0.00, 1.70] class: 1 | | | | | | | | | |--- Income > 113.00 | | | | | | | | | | |--- ZIPCode_94 <= 0.50 | | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | | | |--- ZIPCode_94 > 0.50 | | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | |--- Income > 114.50 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | |--- Age > 48.50 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- Mortgage > 126.25 | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | | |--- Income > 116.50 | | | | | | |--- weights: [0.00, 45.05] class: 1 | | |--- Education_Professional > 0.50 | | | |--- Income <= 116.50 | | | | |--- CCAvg <= 1.10 | | | | | |--- weights: [1.95, 0.00] class: 0 | | | | |--- CCAvg > 1.10 | | | | | |--- Age <= 41.50 | | | | | | |--- ZIPCode_94 <= 0.50 | | | | | | | |--- weights: [1.20, 0.00] class: 0 | | | | | | |--- ZIPCode_94 > 0.50 | | | | | | | |--- Mortgage <= 74.50 | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | |--- Mortgage > 74.50 | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | |--- Age > 41.50 | | | | | | |--- Income <= 100.00 | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- Income > 100.00 | | | | | | | |--- CCAvg <= 1.85 | | | | | | | | |--- ZIPCode_93 <= 0.50 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | |--- ZIPCode_93 > 0.50 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | |--- CCAvg > 1.85 | | | | | | | | |--- weights: [0.00, 4.25] class: 1 | | | |--- Income > 116.50 | | | | |--- weights: [0.00, 52.70] class: 1 | |--- Family > 2.50 | | |--- Income <= 113.50 | | | |--- CCAvg <= 2.75 | | | | |--- Income <= 106.50 | | | | | |--- weights: [3.90, 0.00] class: 0 | | | | |--- Income > 106.50 | | | | | |--- Experience <= 3.50 | | | | | | |--- weights: [1.50, 0.00] class: 0 | | | | | |--- Experience > 3.50 | | | | | | |--- Family <= 3.50 | | | | | | | |--- weights: [0.90, 0.00] class: 0 | 
| | | | | |--- Family > 3.50 | | | | | | | |--- Experience <= 34.00 | | | | | | | | |--- Experience <= 9.50 | | | | | | | | | |--- CCAvg <= 2.35 | | | | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | | | | | |--- CCAvg > 2.35 | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | |--- Experience > 9.50 | | | | | | | | | |--- weights: [0.00, 4.25] class: 1 | | | | | | | |--- Experience > 34.00 | | | | | | | | |--- CreditCard_1 <= 0.50 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | |--- CreditCard_1 > 0.50 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | |--- CCAvg > 2.75 | | | | |--- Age <= 57.00 | | | | | |--- CCAvg <= 4.85 | | | | | | |--- weights: [0.00, 10.20] class: 1 | | | | | |--- CCAvg > 4.85 | | | | | | |--- Age <= 48.00 | | | | | | | |--- Age <= 36.50 | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | |--- Age > 36.50 | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | |--- Age > 48.00 | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | |--- Age > 57.00 | | | | | |--- CCAvg <= 3.90 | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | |--- CCAvg > 3.90 | | | | | | |--- weights: [0.45, 0.00] class: 0 | | |--- Income > 113.50 | | | |--- Age <= 66.00 | | | | |--- Income <= 116.50 | | | | | |--- CCAvg <= 2.50 | | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | | |--- CCAvg > 2.50 | | | | | | |--- Age <= 60.50 | | | | | | | |--- weights: [0.00, 5.10] class: 1 | | | | | | |--- Age > 60.50 | | | | | | | |--- ZIPCode_94 <= 0.50 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | |--- ZIPCode_94 > 0.50 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | |--- Income > 116.50 | | | | | |--- weights: [0.00, 130.90] class: 1 | | | |--- Age > 66.00 | | | | |--- weights: [0.15, 0.00] class: 0
# importance of features in the tree building ( The importance of a feature is computed as the
# (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance )
importance_df = pd.DataFrame(
    model.feature_importances_, columns=["Imp"], index=X_train.columns
)
print(importance_df.sort_values(by="Imp", ascending=False))
Imp Income 5.996993e-01 Education_Graduate 1.366862e-01 CCAvg 7.930169e-02 Education_Professional 6.729348e-02 Family 6.641078e-02 Age 1.356784e-02 CD_Account_1 1.099955e-02 Experience 5.912055e-03 Mortgage 5.040802e-03 Securities_Account_1 4.716203e-03 ZIPCode_94 4.701509e-03 ZIPCode_91 3.426216e-03 ZIPCode_92 8.015507e-04 CreditCard_1 7.213956e-04 ZIPCode_93 7.213956e-04 ZIPCode_95 2.447958e-16 ZIPCode_96 0.000000e+00 Online_1 0.000000e+00 const 0.000000e+00
# Horizontal bar chart of the unpruned tree's feature importances,
# ordered so the most important feature appears at the top.
importances = model.feature_importances_
indices = np.argsort(importances)
ypos = range(len(indices))
plt.figure(figsize=(8, 8))
plt.title("Feature Importances")
plt.barh(ypos, importances[indices], align="center", color="violet")
plt.yticks(ypos, [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
• According to the Decision Tree Model, Income is most important when predicting if a customer secures a Personal Loan.
from sklearn.model_selection import GridSearchCV
# Choose the type of classifier (same class weights as the baseline tree).
estimator = DecisionTreeClassifier(random_state=1,class_weight = {0:.15,1:.85})
# Grid of parameters to choose from
parameters = {
    'max_depth': np.arange(1,10),
    'criterion': ['entropy','gini'],
    'splitter': ['best','random'],
    'min_impurity_decrease': [0.000001,0.00001,0.0001],
    'max_features': ['log2','sqrt']
}
# Type of scoring used to compare parameter combinations — the grid is
# optimized for recall.
scorer = metrics.make_scorer(metrics.recall_score)
# Run the grid search (5-fold cross-validation)
grid_obj = GridSearchCV(estimator, parameters, scoring=scorer,cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
# Set the clf to the best combination of parameters
estimator = grid_obj.best_estimator_
# Fit the best algorithm to the full training data.
estimator.fit(X_train, y_train)
DecisionTreeClassifier(class_weight={0: 0.15, 1: 0.85}, criterion='entropy',
max_depth=5, max_features='log2',
min_impurity_decrease=1e-06, random_state=1)
# Confusion matrix (test set) and train/test recall for the tuned tree.
make_confusion_matrix(estimator,y_test)
get_recall_score(estimator)
Recall on training set : 0.8187311178247734 Recall on test set : 0.738255033557047
# Visualize the tuned (depth-limited) decision tree.
plt.figure(figsize=(15,10))
out = tree.plot_tree(estimator,feature_names=feature_names,filled=True,fontsize=9,node_ids=False,class_names=None)
# re-draw split arrows in black if matplotlib dropped them
for o in out:
    arrow = o.arrow_patch
    if arrow is not None:
        arrow.set_edgecolor('black')
        arrow.set_linewidth(1)
plt.show()
# Text report showing the rules of a decision tree -
print(tree.export_text(estimator,feature_names=feature_names,show_weights=True))
|--- Experience <= 12.50 | |--- Income <= 90.50 | | |--- CCAvg <= 3.00 | | | |--- weights: [105.45, 0.00] class: 0 | | |--- CCAvg > 3.00 | | | |--- Income <= 68.50 | | | | |--- weights: [0.75, 0.00] class: 0 | | | |--- Income > 68.50 | | | | |--- Education_Graduate <= 0.50 | | | | | |--- weights: [2.25, 0.85] class: 0 | | | | |--- Education_Graduate > 0.50 | | | | | |--- weights: [0.00, 2.55] class: 1 | |--- Income > 90.50 | | |--- Education_Graduate <= 0.50 | | | |--- Education_Professional <= 0.50 | | | | |--- Age <= 26.50 | | | | | |--- weights: [3.60, 0.00] class: 0 | | | | |--- Age > 26.50 | | | | | |--- weights: [20.10, 12.75] class: 0 | | | |--- Education_Professional > 0.50 | | | | |--- CCAvg <= 2.35 | | | | | |--- weights: [3.30, 8.50] class: 1 | | | | |--- CCAvg > 2.35 | | | | | |--- weights: [0.75, 35.70] class: 1 | | |--- Education_Graduate > 0.50 | | | |--- ZIPCode_94 <= 0.50 | | | | |--- CD_Account_1 <= 0.50 | | | | | |--- weights: [1.65, 19.55] class: 1 | | | | |--- CD_Account_1 > 0.50 | | | | | |--- weights: [0.00, 5.95] class: 1 | | | |--- ZIPCode_94 > 0.50 | | | | |--- Age <= 36.00 | | | | | |--- weights: [1.35, 11.05] class: 1 | | | | |--- Age > 36.00 | | | | | |--- weights: [0.00, 2.55] class: 1 |--- Experience > 12.50 | |--- CCAvg <= 2.95 | | |--- CD_Account_1 <= 0.50 | | | |--- Mortgage <= 239.00 | | | | |--- CreditCard_1 <= 0.50 | | | | | |--- weights: [197.85, 22.95] class: 0 | | | | |--- CreditCard_1 > 0.50 | | | | | |--- weights: [76.05, 5.10] class: 0 | | | |--- Mortgage > 239.00 | | | | |--- Experience <= 15.50 | | | | | |--- weights: [1.95, 0.00] class: 0 | | | | |--- Experience > 15.50 | | | | | |--- weights: [9.45, 8.50] class: 0 | | |--- CD_Account_1 > 0.50 | | | |--- Income <= 110.00 | | | | |--- weights: [9.75, 0.00] class: 0 | | | |--- Income > 110.00 | | | | |--- Age <= 61.50 | | | | | |--- weights: [0.30, 15.30] class: 1 | | | | |--- Age > 61.50 | | | | | |--- weights: [0.45, 1.70] class: 1 | |--- CCAvg > 2.95 | | |--- Income <= 
81.50 | | | |--- Family <= 2.50 | | | | |--- Education_Professional <= 0.50 | | | | | |--- weights: [2.40, 0.85] class: 0 | | | | |--- Education_Professional > 0.50 | | | | | |--- weights: [2.25, 0.00] class: 0 | | | |--- Family > 2.50 | | | | |--- weights: [3.90, 0.00] class: 0 | | |--- Income > 81.50 | | | |--- CD_Account_1 <= 0.50 | | | | |--- Education_Professional <= 0.50 | | | | | |--- weights: [27.00, 47.60] class: 1 | | | | |--- Education_Professional > 0.50 | | | | | |--- weights: [2.85, 36.55] class: 1 | | | |--- CD_Account_1 > 0.50 | | | | |--- Age <= 61.50 | | | | | |--- weights: [1.95, 34.00] class: 1 | | | | |--- Age > 61.50 | | | | | |--- weights: [0.00, 9.35] class: 1
# importance of features in the tree building ( The importance of a feature is computed as the
# (normalized) total reduction of the 'criterion' brought by that feature. It is also known as the Gini importance )
print (pd.DataFrame(estimator.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values(by = 'Imp', ascending = False))
# After tuning, importance is concentrated in fewer features than in the unpruned tree.
Imp CCAvg 0.385331 Income 0.368752 CD_Account_1 0.085863 Education_Professional 0.082745 Education_Graduate 0.028108 Mortgage 0.024982 Age 0.010420 Experience 0.009540 CreditCard_1 0.002296 Family 0.001741 ZIPCode_94 0.000221 Online_1 0.000000 Securities_Account_1 0.000000 const 0.000000 ZIPCode_96 0.000000 ZIPCode_95 0.000000 ZIPCode_92 0.000000 ZIPCode_91 0.000000 ZIPCode_93 0.000000
# Horizontal bar chart of the tuned tree's feature importances,
# sorted from least to most important.
importances = estimator.feature_importances_
order = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title('Feature Importances')
plt.barh(range(len(order)), importances[order], color='violet', align='center')
plt.yticks(range(len(order)), [feature_names[i] for i in order])
plt.xlabel('Relative Importance')
plt.show()
# Grow a fully-developed weighted tree and extract its minimal
# cost-complexity pruning path (candidate alphas + resulting impurities).
clf = DecisionTreeClassifier(random_state=1, class_weight={0: 0.15, 1: 0.85})
pruning_path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = pruning_path.ccp_alphas
impurities = pruning_path.impurities
pd.DataFrame(pruning_path)
| ccp_alphas | impurities | |
|---|---|---|
| 0 | 0.000000e+00 | -4.601043e-15 |
| 1 | 7.482671e-19 | -4.600295e-15 |
| 2 | 7.482671e-19 | -4.599547e-15 |
| 3 | 7.482671e-19 | -4.598798e-15 |
| 4 | 7.482671e-19 | -4.598050e-15 |
| 5 | 7.482671e-19 | -4.597302e-15 |
| 6 | 1.760629e-18 | -4.595541e-15 |
| 7 | 1.760629e-18 | -4.593781e-15 |
| 8 | 1.892676e-18 | -4.591888e-15 |
| 9 | 2.332833e-18 | -4.589555e-15 |
| 10 | 4.291532e-18 | -4.585264e-15 |
| 11 | 4.511611e-18 | -4.580752e-15 |
| 12 | 4.665666e-18 | -4.576086e-15 |
| 13 | 5.854090e-18 | -4.570232e-15 |
| 14 | 1.143528e-16 | -4.455879e-15 |
| 15 | 1.872164e-04 | 3.744328e-04 |
| 16 | 1.925655e-04 | 7.595638e-04 |
| 17 | 1.957616e-04 | 1.151087e-03 |
| 18 | 3.182679e-04 | 1.787623e-03 |
| 19 | 3.369896e-04 | 2.124612e-03 |
| 20 | 3.637348e-04 | 2.488347e-03 |
| 21 | 3.643130e-04 | 2.852660e-03 |
| 22 | 3.744328e-04 | 3.601526e-03 |
| 23 | 3.744328e-04 | 3.975959e-03 |
| 24 | 3.829427e-04 | 4.358901e-03 |
| 25 | 3.879017e-04 | 4.746803e-03 |
| 26 | 5.907779e-04 | 7.109915e-03 |
| 27 | 6.700377e-04 | 7.779952e-03 |
| 28 | 6.925559e-04 | 8.472508e-03 |
| 29 | 7.776682e-04 | 9.250176e-03 |
| 30 | 8.174649e-04 | 1.333750e-02 |
| 31 | 8.521714e-04 | 1.504184e-02 |
| 32 | 9.095010e-04 | 1.595134e-02 |
| 33 | 9.404360e-04 | 1.689178e-02 |
| 34 | 9.407728e-04 | 1.877333e-02 |
| 35 | 1.011155e-03 | 1.978448e-02 |
| 36 | 1.212606e-03 | 2.099709e-02 |
| 37 | 1.249613e-03 | 2.349631e-02 |
| 38 | 1.399934e-03 | 2.489625e-02 |
| 39 | 1.638043e-03 | 2.653429e-02 |
| 40 | 1.644638e-03 | 2.817893e-02 |
| 41 | 1.819971e-03 | 2.999890e-02 |
| 42 | 1.928896e-03 | 3.192780e-02 |
| 43 | 2.685352e-03 | 3.729850e-02 |
| 44 | 2.742431e-03 | 4.004093e-02 |
| 45 | 2.758322e-03 | 4.279925e-02 |
| 46 | 3.335999e-03 | 4.613525e-02 |
| 47 | 3.527226e-03 | 4.966248e-02 |
| 48 | 4.169566e-03 | 5.383204e-02 |
| 49 | 4.391862e-03 | 5.822390e-02 |
| 50 | 5.138280e-03 | 6.336218e-02 |
| 51 | 5.262465e-03 | 6.862465e-02 |
| 52 | 2.253222e-02 | 9.115687e-02 |
| 53 | 4.072766e-02 | 2.133399e-01 |
| 54 | 2.537957e-01 | 4.671356e-01 |
# Total leaf impurity as a function of effective alpha. The last point is
# dropped: it corresponds to the trivial single-node (root-only) tree.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set(
    xlabel="effective alpha",
    ylabel="total impurity of leaves",
    title="Total Impurity vs effective alpha for training set",
)
plt.show()
• We see that as the effective alpha increases, so does the total impurity of the leaves.
# Fit one class-weighted decision tree for every candidate alpha along the
# pruning path (sklearn's fit() returns the estimator, so a comprehension works).
clfs = [
    DecisionTreeClassifier(
        random_state=1, ccp_alpha=alpha, class_weight={0: 0.15, 1: 0.85}
    ).fit(X_train, y_train)
    for alpha in ccp_alphas
]
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
      clfs[-1].tree_.node_count, ccp_alphas[-1]))
Number of nodes in the last tree is: 1 with ccp_alpha: 0.25379571489480934
# Discard the final tree/alpha pair: it is the trivial root-only tree.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
node_counts = [model.tree_.node_count for model in clfs]
depth = [model.tree_.max_depth for model in clfs]
# Two stacked panels: tree size and tree depth, both versus alpha.
fig, (ax_nodes, ax_depth) = plt.subplots(2, 1, figsize=(10, 7))
ax_nodes.plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post")
ax_nodes.set_xlabel("alpha")
ax_nodes.set_ylabel("number of nodes")
ax_nodes.set_title("Number of nodes vs alpha")
ax_depth.plot(ccp_alphas, depth, marker='o', drawstyle="steps-post")
ax_depth.set_xlabel("alpha")
ax_depth.set_ylabel("depth of tree")
ax_depth.set_title("Depth vs alpha")
fig.tight_layout()
# Recall on the train and test splits for every pruned tree along the path.
recall_train = [
    metrics.recall_score(y_train, model.predict(X_train)) for model in clfs
]
recall_test = [
    metrics.recall_score(y_test, model.predict(X_test)) for model in clfs
]
# Accuracy curves, computed for reference (not plotted below).
train_scores = [model.score(X_train, y_train) for model in clfs]
test_scores = [model.score(X_test, y_test) for model in clfs]
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("alpha")
ax.set_ylabel("Recall")
ax.set_title("Recall vs alpha for training and testing sets")
ax.plot(ccp_alphas, recall_train, marker='o', label="train",
        drawstyle="steps-post",)
ax.plot(ccp_alphas, recall_test, marker='o', label="test",
        drawstyle="steps-post")
ax.legend()
plt.show()
• The maximum recall occurs at alpha ≈ 0.04, but choosing that value would prune the decision tree down to a single root node. Instead, we choose alpha ≈ 0.005, which retains the tree's structure while still achieving a high recall.
# creating the model where we get highest train and test recall
# np.argmax picks the first pruned tree with the maximal test recall.
index_best_model = np.argmax(recall_test)
best_model = clfs[index_best_model]
print(best_model)
DecisionTreeClassifier(ccp_alpha=0.005138280016554104,
class_weight={0: 0.15, 1: 0.85}, random_state=1)
best_model.fit(X_train, y_train)
DecisionTreeClassifier(ccp_alpha=0.005138280016554104,
class_weight={0: 0.15, 1: 0.85}, random_state=1)
# Project helpers (defined earlier in the notebook): confusion matrix on the
# test split, and recall printed for both train and test splits.
make_confusion_matrix(best_model,y_test)
get_recall_score(best_model)
Recall on training set : 0.9909365558912386 Recall on test set : 0.9865771812080537
# Render the pruned best_model and darken every branch arrow for readability.
plt.figure(figsize=(5,5))
annotations = tree.plot_tree(
    best_model,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
for annotation in annotations:
    patch = annotation.arrow_patch
    if patch is not None:
        patch.set_edgecolor('black')
        patch.set_linewidth(1)
plt.show()
# Alternative pruned tree with a manually chosen, smaller alpha (0.002)
# to keep more structure than the argmax-selected model.
best_model2 = DecisionTreeClassifier(
    random_state=1,
    ccp_alpha=0.002,
    class_weight={0: 0.15, 1: 0.85},
)
best_model2.fit(X_train, y_train)
DecisionTreeClassifier(ccp_alpha=0.002, class_weight={0: 0.15, 1: 0.85},
random_state=1)
# Project helpers (defined earlier in the notebook): confusion matrix on the
# test split, and recall printed for both train and test splits.
make_confusion_matrix(best_model2,y_test)
get_recall_score(best_model2)
Recall on training set : 0.9879154078549849 Recall on test set : 0.9328859060402684
# Render best_model2 and darken every branch arrow for readability.
plt.figure(figsize=(15,10))
annotations = tree.plot_tree(
    best_model2,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
for annotation in annotations:
    patch = annotation.arrow_patch
    if patch is not None:
        patch.set_edgecolor('black')
        patch.set_linewidth(1)
plt.show()
# Text report showing the rules of a decision tree -
rules_report = tree.export_text(
    best_model2, feature_names=feature_names, show_weights=True
)
print(rules_report)
|--- Income <= 98.50 | |--- CCAvg <= 2.95 | | |--- weights: [374.10, 0.00] class: 0 | |--- CCAvg > 2.95 | | |--- CD_Account_1 <= 0.50 | | | |--- CCAvg <= 3.95 | | | | |--- Income <= 81.50 | | | | | |--- weights: [7.35, 2.55] class: 0 | | | | |--- Income > 81.50 | | | | | |--- weights: [4.35, 9.35] class: 1 | | | |--- CCAvg > 3.95 | | | | |--- weights: [6.75, 0.00] class: 0 | | |--- CD_Account_1 > 0.50 | | | |--- weights: [0.15, 6.80] class: 1 |--- Income > 98.50 | |--- Family <= 2.50 | | |--- Education_Professional <= 0.50 | | | |--- Education_Graduate <= 0.50 | | | | |--- Income <= 100.00 | | | | | |--- weights: [0.45, 1.70] class: 1 | | | | |--- Income > 100.00 | | | | | |--- weights: [67.20, 0.85] class: 0 | | | |--- Education_Graduate > 0.50 | | | | |--- Income <= 110.00 | | | | | |--- weights: [1.80, 0.00] class: 0 | | | | |--- Income > 110.00 | | | | | |--- weights: [1.05, 47.60] class: 1 | | |--- Education_Professional > 0.50 | | | |--- Income <= 116.50 | | | | |--- CCAvg <= 1.10 | | | | | |--- weights: [1.95, 0.00] class: 0 | | | | |--- CCAvg > 1.10 | | | | | |--- weights: [1.50, 6.80] class: 1 | | | |--- Income > 116.50 | | | | |--- weights: [0.00, 52.70] class: 1 | |--- Family > 2.50 | | |--- Income <= 113.50 | | | |--- CCAvg <= 2.75 | | | | |--- Income <= 106.50 | | | | | |--- weights: [3.90, 0.00] class: 0 | | | | |--- Income > 106.50 | | | | | |--- weights: [3.00, 5.10] class: 1 | | | |--- CCAvg > 2.75 | | | | |--- weights: [0.90, 11.90] class: 1 | | |--- Income > 113.50 | | | |--- weights: [0.90, 136.00] class: 1
# Feature importance of best_model2: the (normalized) total reduction of the
# split criterion contributed by each feature, a.k.a. Gini importance.
importance_df2 = pd.DataFrame(
    best_model2.feature_importances_, columns=["Imp"], index=X_train.columns
)
print(importance_df2.sort_values(by="Imp", ascending=False))
Imp Income 0.631552 Education_Graduate 0.146714 CCAvg 0.075895 Education_Professional 0.070443 Family 0.063589 CD_Account_1 0.011806 const 0.000000 ZIPCode_95 0.000000 Online_1 0.000000 Securities_Account_1 0.000000 ZIPCode_96 0.000000 ZIPCode_93 0.000000 ZIPCode_94 0.000000 Age 0.000000 ZIPCode_92 0.000000 ZIPCode_91 0.000000 Mortgage 0.000000 Experience 0.000000 CreditCard_1 0.000000
# Horizontal bar chart of best_model2's feature importances,
# sorted from least to most important.
importances = best_model2.feature_importances_
order = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title('Feature Importances')
plt.barh(range(len(order)), importances[order], color='violet', align='center')
plt.yticks(range(len(order)), [feature_names[i] for i in order])
plt.xlabel('Relative Importance')
plt.show()
# Summary of recall for the three decision-tree variants built above.
# Fix: label typo "Decision treee" -> "Decision tree".
comparison_frame = pd.DataFrame(
    {
        'Model': [
            'Initial decision tree model',
            'Decision tree with hyperparameter tuning',
            'Decision tree with post-pruning',
        ],
        'Train_Recall': [0.81, 0.99, 0.98],
        'Test_Recall': [0.73, 0.98, 0.93],
    }
)
comparison_frame
| Model | Train_Recall | Test_Recall | |
|---|---|---|---|
| 0 | Initial decision tree model | 0.81 | 0.73 |
| 1 | Decision tree with hyperparameter tuning | 0.99 | 0.98 |
| 2 | Decision tree with post-pruning | 0.98 | 0.93 |